Introduction to the requests module
1. Introduction to the requests module
- The basic workflow of a crawler is to fetch the HTML page, parse the page, and save the results; the requests module is used for the first step, fetching the page
- The requests module is generally used to send network requests and to scrape HTML pages
- requests is essentially a wrapper built on top of the urllib module
2. requests vs. urllib (a side-by-side sketch follows this list)
- With the requests module:
  - URL encoding is handled automatically
  - POST request parameters are handled automatically
  - Cookie and proxy handling is simplified
  - ......
- With urllib:
  - URL encoding must be handled manually
  - POST request parameters must be handled manually
  - Cookie and proxy handling is cumbersome
  - ......
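The difference is easiest to see with a POST request. The sketch below is illustrative only (it reuses the Baidu Translate endpoint from a later example): urllib needs the form data URL-encoded and converted to bytes by hand, while requests accepts a plain dict.
import requests
from urllib import parse, request

post_data = {'kw': '狗'}

# urllib: URL-encode the form data and convert it to bytes by hand before sending
encoded = parse.urlencode(post_data).encode('utf-8')
req = request.Request('https://fanyi.baidu.com/sug', data=encoded)
# response = request.urlopen(req)

# requests: pass the dict directly; the encoding is handled automatically
response = requests.post('https://fanyi.baidu.com/sug', data=post_data)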
3. The requests workflow for simulating a browser visit
- Specify the URL
- Send the request
- Extract the data from the response object
- Persist the data (i.e. save what was scraped)
import requests
# 1. Specify the URL
url = 'https://www.sogou.com/'
# 2. Send the request
response = requests.get(url=url)
# 3. Extract the data from the response object
page_text = response.text
# 4. Persist the data (i.e. save what was scraped)
with open('./sogou.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
4. Notes on using the requests module
- Always set a User-Agent request header when using the requests module, to get past the User-Agent check (i.e. UA check) used by anti-crawling mechanisms; a short sketch follows
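Without an explicit header, requests identifies itself, which a UA check spots immediately. A minimal sketch (the browser UA string is just the example value used throughout these notes):
import requests

# The default User-Agent looks like 'python-requests/2.x.x' and is easy to block
print(requests.utils.default_headers()['User-Agent'])

# Override it with a browser-like User-Agent on every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
response = requests.get('https://www.sogou.com/', headers=headers)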
Using the requests module
1. Request methods in the requests module
import requests
requests.get()     # fetch a page with a GET request
requests.post()    # fetch a page with a POST request
requests.put()     # fetch a page with a PUT request
requests.patch()   # fetch a page with a PATCH request
requests.delete()  # fetch a page with a DELETE request
requests.head()    # fetch only the response headers (see the example below)
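For example, head() asks the server for the headers only and downloads no page body; a small illustrative check (reusing the Sogou homepage from above):
import requests

response = requests.head('https://www.sogou.com/')
print(response.status_code)                  # e.g. 200
print(response.headers.get('Content-Type'))  # header info only; response.text is empty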
2. Parameters of the request methods
- url -> the URL to send the request to
import requests
url = 'https://www.sogou.com/'
response = requests.get(url=url)
- params -> the query parameters of a GET request (encoded automatically; see the check after the example)
import requests
url = 'https://www.sogou.com/web'
param = {
    'query': '电影'
}
response = requests.get(url=url, params=param)
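The dict passed as params is percent-encoded into the query string automatically; a small illustrative check that builds the request without sending it:
import requests

url = 'https://www.sogou.com/web'
param = {'query': '电影'}
prepared = requests.Request('GET', url, params=param).prepare()  # prepare only, no request is sent
print(prepared.url)  # https://www.sogou.com/web?query=%E7%94%B5%E5%BD%B1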
- data -> the form parameters of a POST request
import requests
url = 'https://fanyi.baidu.com/sug'
data = {
    'kw': '狗'
}
response = requests.post(url=url, data=data)
- headers -> the request headers to send
import requests
url = 'https://www.sogou.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
3. Relevant attributes of the response object
- Getting the scraped data
- response.text -> returns the data as a string
import requests
url = 'https://www.sogou.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
- response.content -> returns the data as binary (i.e. bytes)
import requests
url = 'https://www.sogou.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.content
- response.json() -> returns the data as a dict (the parsed JSON body)
import requests
url = 'https://fanyi.baidu.com/sug'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
data = {'kw': '狗'}
response = requests.post(url=url, data=data, headers=headers)
data_dic = response.json()
- Getting the status code
- response.status_code (a short usage sketch follows the example below)
import requests
url = 'https://www.sogou.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
response = requests.get(url=url, headers=headers)
status_code = response.status_code
print(status_code) # 200
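status_code can be checked before persisting anything; a minimal sketch (raise_for_status() is the standard requests helper that raises an HTTPError for 4xx/5xx responses):
import requests

url = 'https://www.sogou.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
response = requests.get(url=url, headers=headers)
if response.status_code == 200:
    print('request succeeded')
else:
    response.raise_for_status()  # raises an HTTPError for 4xx/5xx status codes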
Crawler examples
1. Scrape the Sogou homepage
import requests
url = 'https://www.sogou.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
with open('./sogou.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
2. Scrape the Sogou results page for a given search term
import requests
url = 'https://www.sogou.com/web'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
wd = input('Enter the term to search for: ')
param = {
    'query': wd
}
response = requests.get(url=url, params=param, headers=headers)
page_text = response.content
file_name = '%s.html' % wd
with open(file_name, 'wb') as f:
    f.write(page_text)
3. Scrape Baidu Translate results
import requests
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
wd = input('Enter the text to translate: ')
data = {
    'kw': wd
}
response = requests.post(url=url, data=data, headers=headers)
print(response.json())
4. Scrape movie details from the Douban movie category chart
import requests
url = 'https://movie.douban.com/j/chart/top_list'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
params = {
    'type': 5,
    'interval_id': '100:90',
    'action': '',
    'start': 20,
    'limit': 20,
}
response = requests.get(url=url, params=params, headers=headers)
print(response.json())
5. Scrape data on PRC cosmetics production licences from the National Medical Products Administration
import requests
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
id_list = []
# Collect the company IDs from the list pages
for page in range(1, 11):
    data = {
        'on': 'true',
        'page': page,
        'pageSize': 15,
        'productName': '',
        'conditionType': 1,
        'applyname': '',
        'applysn': '',
    }
    response = requests.post(url=url, data=data, headers=headers)
    json_data = response.json()
    for company in json_data['list']:
        company_id = company['ID']
        id_list.append(company_id)
# Look up each company's details by its ID
detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for company_id in id_list:
    detail_data = {
        'id': company_id
    }
    detail_response = requests.post(url=detail_url, data=detail_data, headers=headers)
    detail_json = detail_response.json()
    print(detail_json)
6. Scrape an image
import requests
url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1575719684752&di=478b3ebe5d21cac01d47ab26302811d4&imgtype=0&src=http%3A%2F%2Fn.sinaimg.cn%2Ffront%2F266%2Fw640h426%2F20181010%2F5O8K-hkrzyan5860450.jpg'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
img_data = requests.get(url=url, headers=headers).content
with open('./哈士奇.jpg', 'wb') as f:
    f.write(img_data)
7. Batch-scrape images with a regular expression
import requests
import re
import os
from urllib.request import urlretrieve
url = 'https://www.hahamx.cn/pic/new/%d'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
}
if not os.path.exists('./imgs'):  # check whether the folder already exists
    os.mkdir('./imgs')  # create the folder for the downloaded images
# Ask which pages to scrape
start_page = int(input('Enter the start page number: '))
end_page = int(input('Enter the end page number: '))
for page in range(start_page, end_page + 1):
    new_url = url % page  # build the URL of the current page
    page_text = requests.get(url=new_url, headers=headers).text  # fetch the page HTML
    # Extract the path and filename of every image on the page
    img_data_list = re.findall('<div class="joke-main-content clearfix">.*?class="clearfix joke-main-gif-suspend joke-main-img-wrapper joke-main-img-gif-wrapper".*?data-path="(.*?)" data-filename="(.*?)">', page_text, re.S)
    for img_data in img_data_list:
        img_url = 'https://image.hahamx.cn/%s/normal/%s' % img_data  # build the image URL
        img_path = './imgs/' + img_data[1]  # where to save the image
        urlretrieve(url=img_url, filename=img_path)  # download and save the image
        print(img_path, 'downloaded successfully!')
print('over!!')